1 Introduction

In this notebook, we will annotate the cells in our tonsil atlas (level 1) using the markers we found for each cluster. Importantly, we will split or merge different clusters to group cells into biologically sound cell types and states.

2 Pre-processing

2.1 Load packages

library(Seurat)
library(SeuratWrappers)
library(harmony)
library(tidyverse)

2.2 Parameters

# Paths
path_to_obj <- here::here("scRNA-seq/results/R_objects/tonsil_rna_integrated_clustered_level_1.rds")
path_to_markers <- here::here("scRNA-seq/3-clustering/1-level_1/tmp/markers_level_1")
path_to_save <- here::here("scRNA-seq/results/R_objects/tonsil_rna_integrated_annotated_level_1.rds")
path_to_save_df <- here::here("scRNA-seq/3-clustering/1-level_1/tmp/annotation_level_1_multiome.rds")


# Functions
source(here::here("scRNA-seq/bin/utils.R"))

2.3 Load data

# Seurat object
tonsil <- readRDS(path_to_obj)
tonsil
## An object of class Seurat 
## 37378 features across 362395 samples within 1 assay 
## Active assay: RNA (37378 features, 0 variable features)
##  3 dimensional reductions calculated: pca, umap, harmony
# Markers
markers_rds <- list.files(path_to_markers)
markers_paths <- str_c(path_to_markers, markers_rds, sep = "/")
markers_dfs <- purrr::map(markers_paths, readRDS)
names(markers_dfs) <- markers_rds %>%
  str_remove("^markers_cluster_") %>%
  str_remove("_level_1.rds")

3 Explore markers

This is the current clustering:

table(tonsil$seurat_clusters) / ncol(tonsil) * 100
## 
##          0          1          2          3          4          5          6          7          8          9         10         11         12 
## 31.0305054 23.0375695 16.1103216 13.8357317  3.5651706  3.5063950  3.4547938  1.6854537  1.3692242  0.9241297  0.8427269  0.3785924  0.2593855
colors <- c("#5A5156", "#E4E1E3", "#F6222E", "#FE00FA", "#16FF32", "#3283FE",
            "#FEAF16", "#B00068", "#1CFFCE", "#90AD1C", "#2ED9FF", "#DEA0FD",
            "#AA0DFE")
p <- DimPlot(
  tonsil,
  group.by = "seurat_clusters",
  pt.size = 0.1,
  cols = colors
)
p

# Stratified by age
umap_age <- plot_split_umap(tonsil, "age_group")
umap_age

3.1 Table

sorted_names <- names(markers_dfs) %>%
  as.numeric() %>%
  sort() %>%
  as.character()
markers_dfs <- markers_dfs[sorted_names]
markers_dfs2 <- purrr::map(names(markers_dfs), function(cluster) {
  df <- markers_dfs[[cluster]] %>%
    rownames_to_column(var = "gene") %>%
    filter(p_val_adj < 0.001 & avg_logFC > 0.75) %>% 
    mutate(cluster = cluster) %>%
    arrange(desc(avg_logFC))
  df
})
names(markers_dfs2) <- names(markers_dfs)
markers_df <- bind_rows(markers_dfs2)
DT::datatable(markers_df)

Notably, cluster 8 and 10 do not seem to have any distinctive marker:

DT::datatable(markers_dfs[["8"]])
DT::datatable(markers_dfs[["10"]])

3.2 UMAPs

We begin by visualizing well-known lineage-specific markers:

canonical_markers <- c("CD79A", "CD79B", "CD3D", "CD3E", "NKG7", "LYZ",
                       "IGHD", "IGHM", "IGHA1", "IGHG1", "FDCSP", "PTCRA",
                       "XBP1", "TOP2A", "KRT19", "SPRR3", "DNTT", "VPREB1")
canonical_markers_gg <- purrr::map(canonical_markers, function(x) {
  p <- feature_plot_doublets(seurat_obj = tonsil, feature = x)
  p
})
names(canonical_markers_gg) <- canonical_markers
canonical_markers_gg
## $CD79A

## 
## $CD79B

## 
## $CD3D

## 
## $CD3E

## 
## $NKG7

## 
## $LYZ

## 
## $IGHD

## 
## $IGHM

## 
## $IGHA1

## 
## $IGHG1

## 
## $FDCSP

## 
## $PTCRA

## 
## $XBP1

## 
## $TOP2A

## 
## $KRT19

## 
## $SPRR3

## 
## $DNTT

## 
## $VPREB1

Let us now visualize a subset of top markers for each cluster:

selected_markers <- list(
  "0" = c("BANK1", "FCER2", "MARCH1", "CD83"),
  "1" = c("CD3D", "IL7R", "CD2", "GIMAP7"),
  "2" = c("TOP2A", "MKI67", "CENPF", "HMGB1"),
  "3" = c("MARCKSL1", "RGS13", "CCDC88A", "LMO2"),
  "4" = c("GNLY", "NKG7", "GZMK", "CD8A"),
  "5" = c("FCRL4", "FCRL5", "PLAC8", "SOX5"),
  "6" = c("IGHG1", "IGHA1", "JCHAIN", "XBP1"),
  "7" = c("LYZ", "S100A8", "APOE", "AIF1"),
  "8" = c("TXNIP", "RPS10", "TRBC2", "TCF7"),
  "9" = c("FDCSP", "CLU", "CXCL13", "CR2"),
  "10" = c("MT2A", "CD3D", "TRAC", "PCNA"),
  "11" = c("PTPRCAP", "CD37", "HSPA1B", "CD74"),
  "12" = c("PTCRA", "LILRA4", "CLECL4C", "IRF7")
)
purrr::map(selected_markers, function(l) {
  FeaturePlot(tonsil, features = l, ncol = 2, pt.size = 0.1)
})
## $`0`

## 
## $`1`

## 
## $`2`

## 
## $`3`

## 
## $`4`

## 
## $`5`

## 
## $`6`

## 
## $`7`

## 
## $`8`

## 
## $`9`

## 
## $`10`

## 
## $`11`

## 
## $`12`

Cell cycle scores:

s_gg <- feature_plot_doublets(seurat_obj = tonsil, feature = "S.Score")
g2m_gg <- feature_plot_doublets(seurat_obj = tonsil, feature = "G2M.Score")
s_gg

g2m_gg

4 Preliminary annotation

Cluster ID Markers Cell Type
0 BANK1, FCER2 naive and memory B
1 CD3D, IL7R CD4+ T
2 MKI67, TOP2A DZ GC B
3 MARCKSL1, RGS13, LMO2, CCDC88A LZ GC B
4 GNLY, NKG7, GZMK, CD8A Cytotoxic
5 FCRL4, FCRL5, PLAC8, SOX5 FCRL4+ memory B
6 IGHG1, IGHA1, JCHAIN, XBP1 PC
7 LYZ, S100A8 myeloid
8 Poorly defined Poor-quality/doublets
9 FDCSP, CLU, CXCL13, CR2 FDC
10 MT2A, CD3D, TRAC, PCNA doublets/proliferative T cells
11 PTPRCAP, CD37, HSPA1, CD74 Unknown
12 PTCRA, LILRA4, CLECL4C, IRF7 PDC

Cluster 8 seems to be composed of poor-quality cells (lysed cells, empty droplets or doublets), because it does not possess any distinctive marker. On the other hand, cluster 10 might be composed of proliferative T cells, as it shows cell cycle and T-cell markers. However, this cluster appears at the zone of the germinal center, so it could also be a cluster of doublets. To shed light into these two points, let us plot the number of genes and the proportion of doublet nearest neighbors (pDNN) across clusters:

VlnPlot(
  tonsil,
  features = c("nFeature_RNA", "pDNN_union"),
  group.by = "seurat_clusters",
  cols = colors,
  pt.size = 0
)

Interestingly, both cluster 8 and 10 have a high pDNN score. All in all, we will remove cluster 8 (clearly technical) and keep cluster 10. We will include it together with the T-cells and decide at the level 2 whether it is technical or biological.

tonsil
## An object of class Seurat 
## 37378 features across 362395 samples within 1 assay 
## Active assay: RNA (37378 features, 0 variable features)
##  3 dimensional reductions calculated: pca, umap, harmony
tonsil <- subset(tonsil, subset = seurat_clusters != "8")
tonsil
## An object of class Seurat 
## 37378 features across 357433 samples within 1 assay 
## Active assay: RNA (37378 features, 0 variable features)
##  3 dimensional reductions calculated: pca, umap, harmony

In addition, we know from previous analysis cluster 11 is composed of cells from the dataset of King et al.. Let us plot their annotation:

umap_df <- tonsil@reductions$umap@cell.embeddings %>%
  as.data.frame() %>%
  mutate(
    cell_type = tonsil$cell_type,
    assay = tonsil$assay,
    seurat_clusters = tonsil$seurat_clusters
  ) %>%
  filter(seurat_clusters == "11", assay == "5P")
umap_cluster_11 <- umap_df %>%
  ggplot(aes(UMAP_1, UMAP_2, color = cell_type)) +
    geom_point() +
    theme_classic()
umap_cluster_11

As we can see, this cluster contains both naive and memory B cells. Importantly, its top marker gene is PTPRCAP, which is a marker of B-cell activation. Following the same strategy as for cluster 10, we will keep them and decide their annotation in level 2, as we will have better resolution.

5 Subset and recluster

We know from exploratory analysis that a subset of cells within the cluster “myeloid” (7) are actually epithelial cells (they express keratins). Since they are conceptually very different, let us annotate them in a separate cluster:

myeloid <- subset(tonsil, idents = "7")
myeloid <- myeloid %>%
  FindVariableFeatures() %>%
  ScaleData() %>%
  RunPCA() %>%
  RunHarmony(group.by.vars = "gem_id", dims = 1:30) %>%
  RunUMAP(dims = 1:30, reduction = "harmony")
FeaturePlot(
  myeloid,
  features = c("KRT19", "KRT78", "KRT13", "PERP"),
  reduction = "umap",
  ncol = 2
)

myeloid <- FindNeighbors(myeloid, dims = 1:30, reduction = "harmony")
myeloid <- FindClusters(myeloid, resolution = 0.1)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 6108
## Number of edges: 239050
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9662
## Number of communities: 13
## Elapsed time: 0 seconds
DimPlot(myeloid, reduction = "umap")

clusters_of_interest <- c("4", "10")
markers_epithelial <- purrr::map(clusters_of_interest, function(x) {
  df <- FindMarkers(
    myeloid,
    ident.1 = x,
    only.pos = TRUE,
    test.use = "wilcox",
    verbose = TRUE
  )
  df <- df %>%
    rownames_to_column(var = "gene") %>%
    filter(p_val_adj < 0.001 & avg_logFC > 1) %>%
    arrange(desc(avg_logFC))
  df
})
epithelial_markers_4 <- markers_epithelial[[1]]
epithelial_markers_10 <- markers_epithelial[[2]]
DT::datatable(epithelial_markers_4)
DT::datatable(epithelial_markers_10)

We can clearly see that subclusters 4 and 10 are epithelial. Thus, we will separate them from the myeloid.

Now, let us follow a similar strategy to “fetch” pre-B cells in cluster 12:

pDC <- subset(tonsil, idents = "12")
pDC <- pDC %>%
  FindVariableFeatures() %>%
  ScaleData() %>%
  RunPCA() %>%
  RunHarmony(group.by.vars = "gem_id", dims = 1:30) %>%
  RunUMAP(dims = 1:30, reduction = "harmony")
FeaturePlot(
  pDC,
  features = c("VPREB1", "DNTT", "CD3D", "LILRA4"),
  reduction = "umap",
  ncol = 2
)

Interestingly enough, we see a small cluster of what could be precursor T-cells, since they express CD3D and DNTT. Let us cluster them and find their markers:

pDC <- FindNeighbors(pDC, k.param = 6)
pDC <- FindClusters(pDC, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 940
## Number of edges: 10980
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8951
## Number of communities: 12
## Elapsed time: 0 seconds
DimPlot(pDC, reduction = "umap")

clusters_of_interest2 <- c("5", "11")
markers_preB_preT <- purrr::map(clusters_of_interest2, function(x) {
  df <- FindMarkers(
    pDC,
    ident.1 = x,
    only.pos = TRUE,
    test.use = "wilcox",
    verbose = TRUE
  )
  df <- df %>%
    rownames_to_column(var = "gene") %>%
    filter(p_val_adj < 0.001)
  df
})
preB_markers <- markers_preB_preT[[1]]
preT_markers <- markers_preB_preT[[2]]
DT::datatable(preB_markers)
DT::datatable(preT_markers)

We can clearly see that subcluster 5 and 11 correspond to pre-B and pre-T cells, respectively. One explanation for this finding might be that pDCs develop from lymphoid progenitors.

6 Final annotation

As we commented in the introduction, we will group similar clusters together. For instance, we will merge FCRL4+ memory, naive and memory B cells in a single cluster. As we follow a top-down approach, at level 2 we will have excluded all the variability related with other lineages, so we will gain much more resolution.

We will try to keep the same nomenclature the group used in previous publications, like this one:

Cluster ID Markers Cell Type
0 BANK1, FCER2 NBC and MBC
1 CD3D, IL7R CD4 T
2 MKI67, TOP2A GCBC
3 MARCKSL1, RGS13, LMO2, CCDC88A GCBC
4 GNLY, NKG7, GZMK, CD8A Cytotoxic
5 FCRL4, FCRL5, PLAC8, SOX5 NBC and MBC
6 IGHG1, IGHA1, JCHAIN, XBP1 PC
7 LYZ, S100A8 myeloid
8 Poorly defined excluded
9 FDCSP, CLU, CXCL13, CR2 FDC
10 MT2A, CD3D, TRAC, PCNA CD4 T
11 PTPRCAP, CD37, HSPA1, CD74 NBC and MBC
12 PTCRA, LILRA4, CLECL4C, IRF7 PDC
# Include the annotation in the metadata
tonsil$annotation_level_1 <- tonsil$seurat_clusters
annotation <- c("NBC_MBC", "CD4_T", "GCBC", "GCBC", "Cytotoxic", "NBC_MBC", "PC",
                "myeloid", "excluded", "FDC", "CD4_T", "NBC_MBC", "PDC")
levels(tonsil$annotation_level_1) <- annotation
tonsil$annotation_level_1 <- as.character(tonsil$annotation_level_1)


# Assign epithelial and pre-B-cell annotation
epithelial_cells <- colnames(myeloid)[myeloid$seurat_clusters %in% c("4", "10")]
preB_cells <- colnames(pDC)[pDC$seurat_clusters == "5"]
preT_cells <- colnames(pDC)[pDC$seurat_clusters == "11"]
tonsil$annotation_level_1[colnames(tonsil) %in% epithelial_cells] <- "epithelial"
tonsil$annotation_level_1[colnames(tonsil) %in% preB_cells] <- "preBC"
tonsil$annotation_level_1[colnames(tonsil) %in% preT_cells] <- "preTC"

Let us plot the annotation:

colors <- c("#5A5156", "#E4E1E3", "#F6222E", "#FE00FA", "#16FF32", "#3283FE",
            "#FEAF16", "#B00068", "#1CFFCE", "#90AD1C", "pink")
sorted_levels <- c("NBC_MBC", "GCBC", "PC", "preBC", "CD4_T", "preTC",
                   "Cytotoxic", "myeloid", "FDC", "PDC", "epithelial")
tonsil$annotation_level_1 <- factor(tonsil$annotation_level_1, sorted_levels)
p_final <- DimPlot(
  tonsil,
  group.by = "annotation_level_1",
  pt.size = 0.1,
  cols = colors
)
p_final

7 Save

saveRDS(tonsil, path_to_save)


# Save annotation for multiome cells, which we will later transfer to ATAC-seq
# cells
annotation_multiome_df <- tonsil@meta.data %>%
  rownames_to_column(var = "cell_barcode") %>%
  dplyr::filter(assay == "multiome") %>%
  dplyr::select("cell_barcode", "annotation_level_1")
saveRDS(annotation_multiome_df, path_to_save_df)

8 Session Information

sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Red Hat Enterprise Linux Server release 6.7 (Santiago)
## 
## Matrix products: default
## BLAS:   /apps/R/3.6.0/lib64/R/lib/libRblas.so
## LAPACK: /home/devel/rmassoni/anaconda3/lib/libmkl_rt.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8     LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8    LC_PAPER=en_US.UTF-8       LC_NAME=C                  LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] forcats_0.5.0        stringr_1.4.0        dplyr_1.0.4          purrr_0.3.4          readr_1.3.1          tidyr_1.1.0          tibble_3.0.1         ggplot2_3.3.0        tidyverse_1.3.0      harmony_1.0          Rcpp_1.0.6           SeuratWrappers_0.2.0 Seurat_3.2.0         BiocStyle_2.14.4    
## 
## loaded via a namespace (and not attached):
##   [1] Rtsne_0.15            colorspace_1.4-1      deldir_0.1-25         ellipsis_0.3.1        ggridges_0.5.2        rprojroot_1.3-2       fs_1.4.1              rstudioapi_0.11       spatstat.data_1.4-3   farver_2.0.3          leiden_0.3.3          listenv_0.8.0         remotes_2.2.0         DT_0.17               ggrepel_0.8.2         RSpectra_0.16-0       fansi_0.4.1           lubridate_1.7.8       xml2_1.3.2            codetools_0.2-16      splines_3.6.0         knitr_1.28            polyclip_1.10-0       jsonlite_1.7.2        broom_0.5.6           ica_1.0-2             cluster_2.1.0         dbplyr_1.4.4          png_0.1-7             uwot_0.1.8            shiny_1.4.0.2         sctransform_0.2.1     BiocManager_1.30.10   compiler_3.6.0        httr_1.4.2            backports_1.1.7       assertthat_0.2.1      Matrix_1.2-18         fastmap_1.0.1         lazyeval_0.2.2        limma_3.42.2          cli_2.0.2             later_1.0.0           htmltools_0.5.1.1     tools_3.6.0           rsvd_1.0.3            igraph_1.2.5          gtable_0.3.0          glue_1.4.1            RANN_2.6.1            reshape2_1.4.4        rappdirs_0.3.1        spatstat_1.64-1       cellranger_1.1.0     
##  [55] vctrs_0.3.6           ape_5.3               nlme_3.1-148          crosstalk_1.1.0.1     lmtest_0.9-37         xfun_0.14             globals_0.12.5        rvest_0.3.5           mime_0.9              miniUI_0.1.1.1        lifecycle_0.2.0       irlba_2.3.3           goftest_1.2-2         future_1.17.0         MASS_7.3-51.6         zoo_1.8-8             scales_1.1.1          hms_0.5.3             promises_1.1.0        spatstat.utils_1.17-0 parallel_3.6.0        RColorBrewer_1.1-2    yaml_2.2.1            reticulate_1.16       pbapply_1.4-2         gridExtra_2.3         rpart_4.1-15          stringi_1.4.6         rlang_0.4.10          pkgconfig_2.0.3       evaluate_0.14         lattice_0.20-41       ROCR_1.0-11           tensor_1.5            labeling_0.3          patchwork_1.0.0       htmlwidgets_1.5.1     cowplot_1.0.0         tidyselect_1.1.0      here_0.1              RcppAnnoy_0.0.16      plyr_1.8.6            magrittr_1.5          bookdown_0.19         R6_2.4.1              generics_0.0.2        DBI_1.1.0             withr_2.4.1           pillar_1.4.4          haven_2.3.1           mgcv_1.8-31           fitdistrplus_1.1-1    survival_3.1-12       abind_1.4-5          
## [109] future.apply_1.5.0    modelr_0.1.8          crayon_1.3.4          KernSmooth_2.23-17    plotly_4.9.2.1        rmarkdown_2.2         viridis_0.5.1         readxl_1.3.1          grid_3.6.0            data.table_1.12.8     blob_1.2.1            reprex_0.3.0          digest_0.6.20         xtable_1.8-4          httpuv_1.5.3.1        munsell_0.5.0         viridisLite_0.3.0